import io, sys, re, codecs
import urllib.request, xml.dom.minidom, json

def write_file(path, s):
    """Append string *s* to the text file at *path*.

    Uses an explicit UTF-8 encoding: the scraped data contains Chinese
    names, and the platform default encoding (e.g. GBK/cp936 on Chinese
    Windows, ASCII in some POSIX locales) can raise UnicodeEncodeError
    or silently produce mixed-encoding output.
    """
    with open(path, "a", encoding="utf-8") as f:
        f.write(s)

# end write_file

def parse_one(s):
    """Parse one ``<tr>...</tr>`` table-row fragment into a result dict
    and append it as one JSON line to ``json.txt``.

    The row is expected to hold 8 cells: sort id, match type, two times,
    athlete name, sex, match name, match day.  Cell 3 may carry an extra
    ``<span>`` child whose text marks a personal best.

    Returns True on success, False when the fragment is not parseable as
    such a row (the caller uses False as a stop signal).
    """
    # Collapse the fragment onto a single line so minidom receives one
    # contiguous element without stray indentation text nodes.
    s = "".join(part.strip() for part in s.split("\n"))

    try:
        doc = xml.dom.minidom.parseString(s)
        cs = doc.documentElement.childNodes

        # cs[0] is a sort id; it is not used in the output record.
        match_type = cs[1].firstChild.nodeValue
        time1 = cs[2].firstChild.nodeValue.strip()
        time2 = cs[3].firstChild.nodeValue.strip()
        name = cs[4].firstChild
        sex = cs[5].firstChild.nodeValue
        match_name = cs[6].firstChild.nodeValue
        match_day = cs[7].firstChild.nodeValue

        # Optional personal-best marker: a second child (<span>) in cell 3.
        if len(cs[3].childNodes) > 1:
            span_node = cs[3].childNodes[1]
            pb = span_node.firstChild.nodeValue
        else:
            pb = "0"

        # Cell 4 is either a bare text node or an element wrapping one.
        if len(name.childNodes) == 0:
            name = name.nodeValue
        else:
            name = name.firstChild.nodeValue

        name = name.strip()

        print("type:(%s), time1:(%s), time2:(%s) name:(%s) sex:(%s), match:(%s) day:(%s) pb:(%s)" % (match_type, time1, time2, name, sex, match_name, match_day, pb))

        mp = {}
        mp["match_type"]  = match_type
        mp["time1"] = time1
        mp["time2"] = time2
        mp["name"] = name
        mp["sex"] = sex
        mp["match_name"] = match_name
        mp["match_day"] = match_day
        mp["pb"] = pb != "0"

        js = json.dumps(mp)
        write_file("json.txt", js+"\n")

        return True
    except (xml.parsers.expat.ExpatError, AttributeError, IndexError):
        # ExpatError: fragment is not well-formed XML.
        # AttributeError/IndexError: row exists but lacks the expected
        # cells/text nodes (previously these crashed the whole crawl;
        # treating them as "not a data row" matches the caller's intent).
        return False

# parse_one


def list_man(s):
    """Scan the "defaultTable" results table in HTML page *s*, feeding
    each body ``<tr>...</tr>`` row to parse_one.

    Returns the number of rows successfully parsed (0 means this page
    has no data — the caller uses that as the end-of-pagination signal).
    """
    r0 = s.find("defaultTable")
    e1 = s.find("<thead>", r0)
    r0 = s.find("</thead>", e1)          # skip the header row

    end_index = s.find("</table>", e1)   # do not read past this table

    count = 0

    while True:
        r1 = s.find("<tr>", r0)
        # Guard against str.find returning -1 (no more rows): the original
        # code fed a bogus negative-index slice to parse_one and only
        # stopped because that slice happened to fail parsing.
        if r1 == -1 or (end_index != -1 and r1 > end_index):
            break
        r2 = s.find("</tr>", r1)
        if r2 == -1 or (end_index != -1 and r2 > end_index):
            break

        if parse_one(s[r1:r2 + len("</tr>")]):
            count += 1
        else:
            break

        r0 = r2
    return count
# list_man


def get_content(url, out_s):
    """Fetch *url* and return the response body decoded as UTF-8.

    *out_s* is a debug-dump basename; the dump is currently disabled but
    the parameter is kept so existing callers keep working.

    The response is opened with a context manager so the underlying
    socket is always closed (the original leaked it).
    """
    with urllib.request.urlopen(url) as f:
        s = str(f.read(), "utf8")

    #with open(out_s+".txt", "w", encoding="utf8") as w_file:
    #    w_file.write(s)
    return s
# end get_content


def test_web():
    """Crawl result pages starting at page 2 until a page yields no rows.

    Page 1 is intentionally skipped (crawl historically started at 2).
    """
    #url = "http://www.runchina.org.cn/portal.php?mod=score&ac=athlete&year=&sex=&age=&project=2&page=3"

    i = 2
    while True:
        print("\n\n===========\nindex:%d" % i)
        #url = "http://www.runchina.org.cn/portal.php?mod=score&ac=athlete&year=2014&sex=1&age=18-34&project=1&page=%d" % i # half marathon
        url = "http://www.runchina.org.cn/portal.php?mod=score&ac=athlete&year=2014&sex=1&age=18-34&project=2&page=%d" % i

        s = get_content(url, "out")
        if list_man(s) == 0:
            print("end scan: url:(%s)" % url)
            # Fix: the original printed "end scan" but never exited, so it
            # kept re-fetching empty pages forever.
            break

        i += 1


def test_txt():
    """Offline variant: parse a previously saved page dump (out.txt)
    instead of fetching from the web."""
    with codecs.open("out.txt", "r", encoding="utf-8") as dump:
        # Strip each line and glue everything into one string, matching
        # what list_man expects from a live page fetch.
        page = "".join(line.strip() for line in dump)
    list_man(page)

if __name__ == "__main__":
    # Guard the entry point so importing this module for its functions
    # does not immediately start a network crawl.
    test_web()
    #test_txt()
